from cvxpy import *
import numpy as np 
import time, collections, os, errno, sys
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from Visualization_function import visualize
from solveCrossTime import *
from scipy import stats

from sklearn import mixture
from sklearn import covariance
import sklearn, random
from sklearn.cluster import KMeans

import pandas as pd
pd.set_option('display.max_columns', 500)
np.set_printoptions(formatter={'float': lambda x: "{0:0.4f}".format(x)})
np.random.seed(102)
##PARAMETERS TO PLAY WITH 
cvxpy_iterations = 15300
num_blocks = 3
maxIters = 1 ##number of iterations of the smoothing + clustering algorithm
adherence_penalty =50000
switch_penalty = 200
seg_len = 400
lam_sparse = 1e-2

rep_iterations = 2
maxClusters = 10
threshold = 10.5e-5
write_out_file = False ##files are written out only if this is True
scaling_time = 3
num_stacked = num_blocks - 1

#num_clusters = 4
hexadecimal_color_list = ["cc0000","0000ff","003300","33ff00","00ffcc","ffff00","ff9900","ff00ff","cccc66","666666","ffccff","660000","00ff00","ffffff","3399ff","006666","330000","ff0000","cc99ff","b0800f","3bd9eb","ef3e1b"]


print "num_cluster", maxClusters - 1
print "num_blocks:", num_blocks


##Synthetic placeholder data (random); replace with the real (m x n) data matrix
Data = 1000*np.random.rand(int(1e7), 50)
Data_pre = Data
UNNORMALIZED_Data = Data*1000
(m,n) = Data.shape
len_D_total = m
size_blocks = n
print "Length of data:", m
print "completed getting the data"

def upper2Full(a, eps = 0):
    """Rebuilds the full symmetric matrix from its flattened upper triangle a.

    Entries of a with absolute value below eps are zeroed out first.
    """
    ind = (a < eps) & (a > -eps)
    a[ind] = 0
    n = int((-1 + np.sqrt(1 + 8*a.shape[0]))/2)  ##matrix size n from the length n*(n+1)/2
    A = np.zeros([n,n])
    A[np.triu_indices(n)] = a
    temp = A.diagonal()
    A = np.asarray((A + A.T) - np.diag(temp))
    return A
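# Illustrative sanity check (toy numbers, not part of the pipeline): the flattened
# upper triangle [1, 2, 3] corresponds to the 2x2 symmetric matrix [[1, 2], [2, 3]].
# >>> upper2Full(np.array([1.0, 2.0, 3.0]))  # -> [[1., 2.], [2., 3.]]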


def updateClusters(LLE_node_vals,switch_penalty = 1):
	"""
	Takes in LLE_node_vals matrix and computes the path that minimizes
	the total cost over the path
	Note the LLE's are negative of the true LLE's actually!!!!!

	Note: switch penalty > 0
	"""
	(T,num_clusters) = LLE_node_vals.shape
	future_cost_vals = np.zeros(LLE_node_vals.shape)

	##compute future costs
	for i in xrange(T-2,-1,-1):
		j = i+1
		indicator = np.zeros(num_clusters)
		future_costs = future_cost_vals[j,:]
		lle_vals = LLE_node_vals[j,:]
		for cluster in xrange(num_clusters):
			total_vals = future_costs + lle_vals + switch_penalty
			total_vals[cluster] -= switch_penalty
			future_cost_vals[i,cluster] = np.min(total_vals)

	##compute the best path
	path = np.zeros(T)

	##the first location
	curr_location = np.argmin(future_cost_vals[0,:] + LLE_node_vals[0,:])
	path[0] = curr_location
	DP_start2 = time.time()
	##compute the path
	for i in xrange(T-1):
		j = i+1
		future_costs = future_cost_vals[j,:]
		lle_vals = LLE_node_vals[j,:]
		total_vals = future_costs + lle_vals + switch_penalty
		total_vals[int(path[i])] -= switch_penalty

		path[i+1] = np.argmin(total_vals)

	##return the computed path
	return path
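# Illustrative example (toy numbers): with a small switch penalty the optimal path
# follows the per-point cost minima, while a large penalty keeps it in one cluster.
# >>> toy_costs = np.array([[0., 10.], [10., 0.], [0., 10.]])
# >>> updateClusters(toy_costs, switch_penalty=1)    # -> [0., 1., 0.]
# >>> updateClusters(toy_costs, switch_penalty=100)  # -> [0., 0., 0.]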

def find_matching(confusion_matrix):
	"""
	returns the perfect matching
	"""
	_,n = confusion_matrix.shape
	path = []
	for i in xrange(n):
		max_val = -1e10
		max_ind = -1
		for j in xrange(n):
			if j in path:
				pass
			else:
				temp = confusion_matrix[i,j]
				if temp > max_val:
					max_val = temp
					max_ind = j
		path.append(max_ind)
	return path
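# Illustrative example (toy numbers): rows are matched greedily in order, so row 0
# takes its best column first and row 1 takes the best remaining column.
# >>> find_matching(np.array([[5., 1.], [2., 7.]]))  # -> [0, 1]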

def computeF1Score_delete(num_cluster,matching_algo,actual_clusters,threshold_algo,save_matrix = False):
	"""
	computes the F1 scores and returns a list of values
	"""
	F1_score = np.zeros(num_cluster)
	for cluster in xrange(num_cluster):
		matched_cluster = matching_algo[cluster]
		true_matrix = actual_clusters[cluster]
		estimated_matrix = threshold_algo[matched_cluster]
		TP = 0
		TN = 0
		FP = 0
		FN = 0
		for i in xrange(num_stacked*n):
			for j in xrange(num_stacked*n):
				if estimated_matrix[i,j] == 1 and true_matrix[i,j] != 0:
					TP += 1.0
				elif estimated_matrix[i,j] == 0 and true_matrix[i,j] == 0:
					TN += 1.0
				elif estimated_matrix[i,j] == 1 and true_matrix[i,j] == 0:
					FP += 1.0
				else:
					FN += 1.0
		precision = (TP)/(TP + FP)
		print "cluster #", cluster
		print "TP,TN,FP,FN---------->", (TP,TN,FP,FN)
		recall = TP/(TP + FN)
		f1 = (2*precision*recall)/(precision + recall)
		F1_score[cluster] = f1
	return F1_score
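# Illustrative arithmetic only: for a cluster with TP = 8, FP = 2, FN = 4,
# precision = 8/10 = 0.8, recall = 8/12 ~ 0.667 and
# F1 = 2*0.8*0.667/(0.8 + 0.667) ~ 0.727.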

def compute_confusion_matrix(num_clusters,clustered_points_algo, sorted_indices_algo):
	"""
	computes a confusion matrix and returns it
	"""
	seg_len = 50  ##local ground-truth segment length (shadows the global seg_len)
	true_confusion_matrix = np.zeros([num_clusters,num_clusters])
	for point in xrange(len(clustered_points_algo)):
		cluster = clustered_points_algo[point]

		#CASE E : ABCABC
		num = (int(sorted_indices_algo[point]/seg_len) %num_clusters)
		true_confusion_matrix[num,cluster] += 1

	return true_confusion_matrix
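# Illustrative example: with the local seg_len = 50 and an ABCABC ground truth over
# (say) 3 clusters, the point at sorted index 120 gets true label int(120/50) % 3 = 2.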

def computeF1_macro(confusion_matrix,matching, num_clusters):
	"""
	computes the macro F1 score
	confusion_matrix : needs its columns permuted
	matching : the permutation according to which the columns are permuted
	"""
	##Permute the matrix columns
	permuted_confusion_matrix = np.zeros([num_clusters,num_clusters])
	for cluster in xrange(num_clusters):
		matched_cluster = matching[cluster]
		permuted_confusion_matrix[:,cluster] = confusion_matrix[:,matched_cluster]
	##Compute the F1 score for every cluster
	F1_score = 0
	for cluster in xrange(num_clusters):
		TP = permuted_confusion_matrix[cluster,cluster]
		FP = np.sum(permuted_confusion_matrix[:,cluster]) - TP
		FN = np.sum(permuted_confusion_matrix[cluster,:]) - TP
		precision = TP/(TP + FP)
		recall = TP/(TP + FN)
		f1 = stats.hmean([precision,recall])
		F1_score += f1
	F1_score /= num_clusters
	return F1_score
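# Note: stats.hmean([precision, recall]) is the harmonic mean, i.e. the usual
# per-cluster F1 = 2*precision*recall/(precision + recall); the macro F1 is the
# average of these per-cluster scores.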

############
##The basic folder to be created
str_NULL = "VW_data_lam_sparse=" + str(lam_sparse)+"cvxpy_iterations"+str(cvxpy_iterations) + "maxClusters=" +str(maxClusters)+"/"

if not os.path.exists(os.path.dirname(str_NULL)):
    try:
        os.makedirs(os.path.dirname(str_NULL))
    except OSError as exc: # Guard against race condition
        if exc.errno != errno.EEXIST:
            raise

def hex_to_rgb(value):
	"""Return (red, green, blue) in [0, 1] for the color given as 'rrggbb'."""
	lv = len(value)
	out = tuple(int(value[i:i + lv // 3], 16) for i in range(0, lv, lv // 3))
	out = tuple([x/255.0 for x in out])  ##rescale the 0-255 channels to [0, 1]
	return out
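# Illustrative example: hex_to_rgb("cc0000") gives roughly (0.8, 0.0, 0.0), i.e. the
# 0-255 channel values (204, 0, 0) rescaled to [0, 1] for matplotlib.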
color_list = []
for hex_color in hexadecimal_color_list:
	rgb_color = hex_to_rgb(hex_color)
	color_list.append(rgb_color)
colors = color_list


train_cluster_inverse = {}
log_det_values = {}
computed_covariance = {}
cluster_mean_info = {}
cluster_mean_stacked_info = {}
old_clustered_points = np.zeros(10) ##sentinel for the convergence check; overwritten each iteration
for iters in xrange(maxIters):
	print "\n\n\nITERATION ###", iters
	iter_start =  time.time()
	num_clusters = maxClusters - 1

	if iters == 0:
		## Now splitting up stuff 
		## split1 : Training and Test
		## split2 : Training and Test - different clusters

		time1 = time.time()
		training_percent = 1
		training_idx = np.random.choice(m-num_blocks+1, size=int((m-num_stacked)*training_percent),replace = False )
		##Ensure that the first and the last few points are in
		training_idx = list(training_idx)
		if 0 not in training_idx:
			training_idx.append(0)
		if m - num_stacked  not in training_idx:
			training_idx.append(m-num_stacked)

		training_idx = np.array(training_idx)
		sorted_training_idx = range(len(training_idx))
		# sorted_training_idx = sorted(training_idx)
		##compute the test indices

		##Stack the complete data
		print "stacking training data"
		time_stack_start = time.time()
		complete_Data = np.zeros([m - num_stacked + 1, num_stacked*n])
		len_data = m
		for i in xrange(m - num_stacked + 1):
			idx = i
			for k in xrange(num_stacked):
				if i+k < len_data:
					idx_k = i + k
					complete_Data[i][k*n:(k+1)*n] =  Data[idx_k][0:n]

		##Stack the training data
		complete_D_train = np.zeros([len(training_idx), num_stacked*n])
		len_training = len(training_idx)
		for i in xrange(len(sorted_training_idx)):
			idx = sorted_training_idx[i]
			for k in xrange(num_stacked):
				if i+k < len_training:
					idx_k = sorted_training_idx[i+k]
					complete_D_train[i][k*n:(k+1)*n] =  Data[idx_k][0:n]
		print "completed stacking data"
		time_stack_end = time.time()
		print "It took :", time_stack_end - time_stack_start, "to stack the data"

		actual_start = time.time()
		print "starting to count time from here"
		#####INITIALIZATION!!!
		gmm_start = time.time()
		gmm = mixture.GaussianMixture(n_components=num_clusters, covariance_type="full")
		##NOTE: the GMM is constructed but never fit; the initial assignment is random
		clustered_points = np.random.choice(num_clusters, size = complete_D_train.shape[0], replace = True)#gmm.predict(complete_D_train)
		gmm_clustered_pts = clustered_points + 0

		gmm_end = time.time()
		print "intialization completed"
		print "gmm took:", gmm_end - gmm_start

		##USE K-means
		print "running K means"
		kmeans_start = time.time()

		##NOTE: the k-means labels are likewise replaced by random assignments here
		clustered_points_kmeans = np.random.choice(num_clusters, size = complete_D_train.shape[0], replace = True)#kmeans.labels_
		kmeans_clustered_pts = np.random.choice(num_clusters, size = complete_D_train.shape[0], replace = True)#kmeans.labels_
		kmeans_end = time.time()
		print "k means took:", kmeans_end - kmeans_start
	
	##Get the train and test points
	train_clusters = collections.defaultdict(list)
	test_clusters = collections.defaultdict(list)
	len_train_clusters = collections.defaultdict(int)
	len_test_clusters = collections.defaultdict(int)

	counter = 0
	for point in range(len(clustered_points)):
		cluster = clustered_points[point]
		train_clusters[cluster].append(point)
		len_train_clusters[cluster] += 1
		counter +=1 



	##train_clusters holds the indices in complete_D_train 
	##for each of the clusters
	for cluster in xrange(num_clusters):
		if len_train_clusters[cluster] != 0:
			indices = train_clusters[cluster]

			print "\n\n\nstarting with cluster:", cluster
			cluster_start = time.time()
			S = np.cov(np.transpose(complete_D_train[indices,:]))
			S_mean = np.mean(complete_D_train[indices,:], axis = 0)
			print "it took about", time.time() - cluster_start,"to get all the training data"
			size_blocks = n
			probSize = num_stacked * size_blocks
			lamb = np.zeros((probSize,probSize)) + lam_sparse
			cov_start = time.time()

			#OPTIMIZATION CODE
			print "starting the OPTIMIZATION"
			opt_start = time.time()
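			##The per-cluster subproblem is a graphical-lasso-style fit: minimize
			##-log det(theta) + trace(S*theta) over positive semidefinite theta, with an
			##l1 penalty of lam_sparse on the entries; the lamb matrix and the dummy edge
			##appear to be how the penalty and block sizes are passed through to the
			##solveCrossTime ADMM solver.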
			gvx = TGraphVX()
			theta = semidefinite(probSize,name='theta')
			obj = -log_det(theta) + trace(S*theta)
			gvx.AddNode(0, obj)
			gvx.AddNode(1)
			dummy = Variable(1)
			gvx.AddEdge(0,1, Objective = lamb*dummy + num_stacked*dummy + size_blocks*dummy)
			gvx.Solve(Verbose=False, MaxIters=1000, Rho = 1, EpsAbs = 1e-6, EpsRel = 1e-6)

			opt_end = time.time() 
			print "SOLVER took:", opt_end - opt_start

			#THIS IS THE SOLUTION
			val = gvx.GetNodeValue(0,'theta')
			S_est = upper2Full(val, 0)
			X2 = S_est
			u, _ = np.linalg.eig(S_est)
			cov_out = np.linalg.inv(X2)

			inv_matrix = cov_out

			##Store the log-det, covariance, inverse-covariance, cluster means, stacked means
			log_det_values[num_clusters, cluster] = np.log(np.linalg.det(cov_out))
			computed_covariance[num_clusters,cluster] = cov_out
			cluster_mean_info[num_clusters,cluster] = S_mean[(num_stacked-1)*n:num_stacked*n].reshape([1,n])
			cluster_mean_stacked_info[num_clusters,cluster] = S_mean
			train_cluster_inverse[cluster] = X2

	cluster_norms = list(np.zeros(num_clusters))

	##Computing the norms
	if iters != 0:
		for cluster in xrange(num_clusters):
			cluster_norms[cluster] = (np.linalg.norm(old_computed_covariance[num_clusters,cluster]),cluster)
		sorted_cluster_norms = sorted(cluster_norms,reverse = True)

	##Add a point to the empty clusters 
	##Assumption more non empty clusters than empty ones
	counter = 0
	for cluster in xrange(num_clusters):
		if len_train_clusters[cluster] == 0:
			print "doing robustness stuff one cluster has zero points"
			robust_start = time.time()
			##Add a point to the cluster
			while len_train_clusters[sorted_cluster_norms[counter][1]] == 0:
				counter += 1
				counter = counter % num_clusters

			cluster_selected = sorted_cluster_norms[counter][1]
			break_flag = False
			while not break_flag:
				point_num = random.randint(0,len(clustered_points)-1)  ##randint is inclusive on both ends
				if clustered_points[point_num] == cluster_selected:
					clustered_points[point_num] = cluster
					# print "old covariances shape", old_computed_covariance[cluster_selected].shape
					computed_covariance[num_clusters,cluster] = old_computed_covariance[num_clusters,cluster_selected]
					cluster_mean_stacked_info[num_clusters,cluster] = complete_D_train[point_num,:]
					cluster_mean_info[num_clusters,cluster] = complete_D_train[point_num,:][(num_stacked-1)*n:num_stacked*n]
					break_flag = True
			print "done adding stuff"
			counter += 1
			print "it took:",  time.time() - robust_start

	old_train_clusters = train_clusters
	old_computed_covariance = computed_covariance
	print "\n\n\n\n"
	cache_start = time.time()
	inv_matrix_cluster ={}
	log_det_cov_cluster = {}
	for cluster in xrange(num_clusters):
		cov_matrix = computed_covariance[num_clusters,cluster][0:(num_blocks-1)*n,0:(num_blocks-1)*n]
		log_det_cov = np.log(np.linalg.det(cov_matrix))# log(det(sigma2|1))
		inv_matrix_cluster[cluster] = np.linalg.inv(cov_matrix)
		log_det_cov_cluster[cluster] = log_det_cov
	##Code -----------------------SMOOTHING
	##For each point compute the LLE 
	print "\n\n"
	LLE_start =  time.time()

	LLE_all_points_clusters = np.zeros([len(clustered_points),num_clusters])
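	##For each point and cluster, lle below is (up to a constant and a factor of 2)
	##the negative Gaussian log-likelihood:
	##lle = (x - mu_c)^T Sigma_c^{-1} (x - mu_c) + log det(Sigma_c),
	##using the cached inverse covariance and log-determinant of each cluster.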
	for point in xrange(len(clustered_points)):
		# print "Point #", point
		if point + num_stacked-1 < complete_D_train.shape[0]:
			for cluster in xrange(num_clusters):
				# print "\nCLuster#", cluster
				cluster_mean = cluster_mean_info[num_clusters,cluster] 
				cluster_mean_stacked = cluster_mean_stacked_info[num_clusters,cluster] 

				x = complete_D_train[point,:] - cluster_mean_stacked[0:(num_blocks-1)*n]
				cov_matrix = computed_covariance[num_clusters,cluster][0:(num_blocks-1)*n,0:(num_blocks-1)*n]
				inv_cov_matrix = inv_matrix_cluster[cluster]#np.linalg.inv(cov_matrix)
				log_det_cov = log_det_cov_cluster[cluster]#np.log(np.linalg.det(cov_matrix))# log(det(sigma2|1))
				lle = np.dot(   x.reshape([1,(num_blocks-1)*n]), np.dot(inv_cov_matrix,x.reshape([n*(num_blocks-1),1]))  ) + log_det_cov
				LLE_all_points_clusters[point,cluster] = lle
	##Update cluster points - using NEW smoothening
	DP_start = time.time()
	clustered_points = updateClusters(LLE_all_points_clusters,switch_penalty = switch_penalty)
	print "Total time for the DYNAMIC PROGRAMMING ALGORITHM:", time.time() - LLE_start



	##compute the F1 macro scores
	f1_EM_tr = 0#computeF1_macro(train_confusion_matrix_EM,matching_EM,num_clusters)
	f1_GMM_tr = 0#computeF1_macro(train_confusion_matrix_GMM,matching_GMM,num_clusters)
	f1_kmeans_tr = 0#computeF1_macro(train_confusion_matrix_kmeans,matching_Kmeans,num_clusters)

	print "\n\n\n"
	print "\n\n\nThe TOTAL ITERATION TIME (from GMM Initialization):", time.time() - actual_start
	print "\n\n"
	if np.array_equal(old_clustered_points,clustered_points):
		print "\n\n\n\nCONVERGED!!! BREAKING EARLY!!!"
		break
	old_clustered_points = clustered_points